Palo Alto¶

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from math import sqrt
import warnings

warnings.filterwarnings("ignore")

data = pd.read_csv('EVChargingStationUsage.csv')
In [2]:
#occurrences of each unique station
station_counts = data['Station Name'].value_counts()
plt.figure(figsize=(15, 10))  
station_counts.plot(kind='barh', color='skyblue')  
plt.title('Usage Count of Individual Stations')
plt.xlabel('Usage Count')
plt.ylabel('Station Name')
plt.gca().invert_yaxis()  
plt.tight_layout()  
plt.show()
No description has been provided for this image
In [3]:
#function to extract the entity based on the first word following "PALO ALTO CA /"
def extract_entity(station_name):
    parts = station_name.split('/')
    if len(parts) > 1:
        # Take the part after "PALO ALTO CA /" and then take the first word
        return parts[1].strip().split(' ')[0]
    else:
        # If the station name does not follow the expected format, return the original
        return station_name

#function to create the 'Entity' column
data['Entity'] = data['Station Name'].apply(extract_entity)
#occurrences of each entity
entity_counts = data['Entity'].value_counts()

#plotting the usage count of each entity
plt.figure(figsize=(12, 6))
entity_counts.plot(kind='bar', color='skyblue')
plt.title('Usage Count of Each Station Entity')
plt.xlabel('Entity')
plt.ylabel('Usage Count')
plt.xticks(rotation=45)
plt.tight_layout()  
plt.show()
No description has been provided for this image
In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Function to create lagged features
def buildLaggedFeatures(s, lag=30, dropna=True):
    df = pd.concat([s.shift(i) for i in range(lag + 1)], axis=1)
    df.columns = ['lag_{}'.format(i) if i != 0 else s.name for i in range(lag + 1)]
    if dropna:
        df = df.dropna()
    return df

# Normalize station names
def normalize_station_name(name):
    name = name.upper()
    if "RINCONADA" in name:
        parts = name.split()
        new_parts = []
        for part in parts:
            if part.startswith('LIB') and len(parts) > parts.index(part) + 1 and parts[parts.index(part) + 1].isdigit():
                digit = parts.pop(parts.index(part) + 1)
                new_parts.append(part + ' #' + digit)
            else:
                new_parts.append(part)
        modified_name = ' '.join(new_parts)
        if 'RINCONADA LIB #' in modified_name:
            return 'RINCONADA LIB'
        return modified_name
    if "SHERMAN" in name:
        return None
    return name.split('#')[0].strip()

# Load data
data = pd.read_csv('EVChargingStationUsage.csv')

data['Normalized Station Name'] = data['Station Name'].apply(normalize_station_name)
data = data.dropna(subset=['Normalized Station Name'])

stations = data['Normalized Station Name'].unique()

# Set up the plot grid
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))
axes = axes.flatten()
fig.subplots_adjust(hspace=0.5, wspace=0.3)

# Processing each normalized station group
for idx, station in enumerate(stations):
    if idx >= 9:
        break
    station_data = data[data['Normalized Station Name'] == station]
    station_data['Start DateTime'] = pd.to_datetime(station_data['Start Date'] + ' ' + station_data['Start Time Zone'], errors='coerce')
    station_data.dropna(subset=['Start DateTime'], inplace=True)
    daily_energy = station_data.groupby(station_data['Start DateTime'].dt.floor('D'))['Energy (kWh)'].sum()
    lagged_features = buildLaggedFeatures(daily_energy, lag=30)
    train_data = lagged_features.iloc[-(150):-30]
    test_data = lagged_features.iloc[-30:]
    X_train = train_data.drop(columns=['Energy (kWh)'])
    y_train = train_data['Energy (kWh)']
    X_test = test_data.drop(columns=['Energy (kWh)'])
    y_test = test_data['Energy (kWh)']
    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)
    y_pred = rf_regressor.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    print(f'RMSE for {station} Stations: {rmse}')
    ax = axes[idx]
    ax.plot(y_test.index, y_test, label='Actual', marker='o')
    ax.plot(y_test.index, y_pred, label='Forecasted', marker='x')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_major_locator(mdates.DayLocator(interval=2))  # Set interval to 2 for every second day
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha="right")
    ax.set_title(f'Energy Consumption for {station}')
    ax.legend()

# Hide any unused axes if there are less than 9 stations
for ax in axes[len(stations):]:
    ax.set_visible(False)

plt.show()
RMSE for PALO ALTO CA / HAMILTON Stations: 32.61363452617447
RMSE for PALO ALTO CA / HIGH Stations: 33.610553449706984
RMSE for PALO ALTO CA / BRYANT Stations: 34.20994912491419
RMSE for PALO ALTO CA / MPL Stations: 34.07715427229769
RMSE for RINCONADA LIB Stations: 43.85210518068926
RMSE for PALO ALTO CA / WEBSTER Stations: 43.06912468664212
RMSE for PALO ALTO CA / TED THOMPSON Stations: 37.50685374748391
RMSE for PALO ALTO CA / CAMBRIDGE Stations: 51.12335097196068
No description has been provided for this image

Palo Alto - Offset¶

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Function to create lagged features
def buildLaggedFeatures_off(s, lag=30, offset=0, dropna=True):
    # Create a DataFrame with shifted data
    df = pd.concat([s.shift(i + offset) for i in range(lag + 1)], axis=1)

    # Adjust column names to reflect the lags, considering the offset
    df.columns = ['lag_{}'.format(i + offset) if i != 0 else s.name for i in range(lag + 1)]

    # Drop rows with missing values if requested
    if dropna:
        df = df.dropna()

    return df

# Normalize station names
def normalize_station_name(name):
    name = name.upper()
    if "RINCONADA" in name:
        parts = name.split()
        new_parts = []
        for part in parts:
            if part.startswith('LIB') and len(parts) > parts.index(part) + 1 and parts[parts.index(part) + 1].isdigit():
                digit = parts.pop(parts.index(part) + 1)
                new_parts.append(part + ' #' + digit)
            else:
                new_parts.append(part)
        modified_name = ' '.join(new_parts)
        if 'RINCONADA LIB #' in modified_name:
            return 'RINCONADA LIB'
        return modified_name
    if "SHERMAN" in name:
        return None
    return name.split('#')[0].strip()

# Load data
data = pd.read_csv('EVChargingStationUsage.csv')

data['Normalized Station Name'] = data['Station Name'].apply(normalize_station_name)
data = data.dropna(subset=['Normalized Station Name'])

stations = data['Normalized Station Name'].unique()

# Set up the plot grid
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))
axes = axes.flatten()
fig.subplots_adjust(hspace=0.5, wspace=0.3)

# Processing each normalized station group
for idx, station in enumerate(stations):
    if idx >= 9:
        break
    station_data = data[data['Normalized Station Name'] == station]
    station_data['Start DateTime'] = pd.to_datetime(station_data['Start Date'] + ' ' + station_data['Start Time Zone'], errors='coerce')
    station_data.dropna(subset=['Start DateTime'], inplace=True)
    daily_energy = station_data.groupby(station_data['Start DateTime'].dt.floor('D'))['Energy (kWh)'].sum()
    lagged_features = buildLaggedFeatures_off(daily_energy, lag=30, offset=7)
    train_data = lagged_features.iloc[:-30]
    # train_data = lagged_features.iloc[-(150):-30]  #error changes based on size of training by rmse=8 cca
    test_data = lagged_features.iloc[-30:]
    X_train = train_data.drop(columns=['Energy (kWh)'])
    y_train = train_data['Energy (kWh)']
    X_test = test_data.drop(columns=['Energy (kWh)'])
    y_test = test_data['Energy (kWh)']
    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)
    y_pred = rf_regressor.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    print(f'RMSE for {station} Stations: {rmse}')
    ax = axes[idx]
    ax.plot(y_test.index, y_test, label='Actual', marker='o')
    ax.plot(y_test.index, y_pred, label='Forecasted', marker='x')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.xaxis.set_major_locator(mdates.DayLocator(interval=2))  # Set interval to 2 for every second day
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha="right")
    ax.set_title(f'Energy Consumption for {station}')
    ax.legend()

# Hide any unused axes if there are less than 9 stations
for ax in axes[len(stations):]:
    ax.set_visible(False)

plt.show()
RMSE for PALO ALTO CA / HAMILTON Stations: 28.544972824540345
RMSE for PALO ALTO CA / HIGH Stations: 33.06261022482949
RMSE for PALO ALTO CA / BRYANT Stations: 39.798684079619456
RMSE for PALO ALTO CA / MPL Stations: 33.974048618823794
RMSE for RINCONADA LIB Stations: 37.3178372979347
RMSE for PALO ALTO CA / WEBSTER Stations: 40.981397027088974
RMSE for PALO ALTO CA / TED THOMPSON Stations: 27.583233061876633
RMSE for PALO ALTO CA / CAMBRIDGE Stations: 52.92351466619919
No description has been provided for this image

Dundee¶

In [6]:
data = pd.read_csv('Dundee_merged (1).csv')
In [7]:
#occurrences of each unique station
station_counts = data['Address 1'].value_counts()
plt.figure(figsize=(15, 10))  
station_counts.plot(kind='barh', color='skyblue')  
plt.title('Usage Count of Individual Stations')
plt.xlabel('Usage Count')
plt.ylabel('Station Name')
plt.gca().invert_yaxis()  
plt.tight_layout()  
plt.show()
No description has been provided for this image
In [8]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming data has already been loaded into the 'data' DataFrame
# Filter out 'Trades Lane' and 'Sinclair Street' from the 'Address 1' column
data_filtered = data[~data['Address 1'].isin(['Trades Lane', 'Sinclair Street','Sinclair Street, Dundee'])]

# Get the occurrences of each unique station, now excluding the specified addresses
station_counts = data_filtered['Address 1'].value_counts()

# Plot the usage count of the remaining individual stations
plt.figure(figsize=(15, 10))
station_counts.plot(kind='barh', color='skyblue')
plt.title('Usage Count of Individual Stations')
plt.xlabel('Usage Count')
plt.ylabel('Station Name')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Function to create lagged features
def buildLaggedFeatures(s, lag=30, dropna=True):
    df = pd.concat([s.shift(i) for i in range(lag + 1)], axis=1)
    df.columns = ['lag_{}'.format(i) if i != 0 else s.name for i in range(lag + 1)]
    if dropna:
        df = df.dropna()
    return df

# Load data
data = pd.read_csv('Dundee_merged (1).csv')  # Ensure the path is correct

# Filter out specific stations
data_filtered = data[~data['Address 1'].isin(['Trades Lane', 'Sinclair Street', 'Sinclair Street, Dundee'])]

# Convert 'Start Date' to datetime
data_filtered['Start DateTime'] = pd.to_datetime(data_filtered['Start Date'], errors='coerce')
data_filtered.dropna(subset=['Start DateTime', 'Energy(kWh)'], inplace=True)

# Get unique stations after filtering
unique_stations = data_filtered['Address 1'].unique()

# Prepare to dynamically create subplots based on data availability
valid_data_stations = []  # To store stations with enough data

# Loop through each station and check data
for station in unique_stations:
    station_data = data_filtered[data_filtered['Address 1'] == station]
    daily_energy = station_data.groupby(station_data['Start DateTime'].dt.floor('D'))['Energy(kWh)'].sum()
    lagged_features = buildLaggedFeatures(daily_energy, lag=30)
    if lagged_features.shape[0] > 30:  # Ensuring enough data for split
        valid_data_stations.append(station)

# Number of plots
n_cols = 5
n_rows = (len(valid_data_stations) + n_cols - 1) // n_cols
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(n_cols * 5, n_rows * 4))  # Slightly increased size
axes = axes.flatten()

# Loop again to model and plot only where data is sufficient
for idx, station in enumerate(valid_data_stations):
    ax = axes[idx]
    station_data = data_filtered[data_filtered['Address 1'] == station]
    daily_energy = station_data.groupby(station_data['Start DateTime'].dt.floor('D'))['Energy(kWh)'].sum()
    lagged_features = buildLaggedFeatures(daily_energy, lag=30)
    train_data = lagged_features.iloc[-(150):-30]
    test_data = lagged_features.iloc[-30:]

    X_train = train_data.drop(columns=['Energy(kWh)'])
    y_train = train_data['Energy(kWh)']
    X_test = test_data.drop(columns=['Energy(kWh)'])
    y_test = test_data['Energy(kWh)']

    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)
    y_pred = rf_regressor.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    print(f'RMSE for {station} Stations: {rmse}')
    # Set custom date ticks
    dates = y_test.index.tolist()
    ticks_to_use = [dates[0], dates[len(dates)//3], dates[2*len(dates)//3], dates[-1]]
    ax.set_xticks(ticks_to_use)

    ax.plot(y_test.index, y_test, label='Actual', marker='o')
    ax.plot(y_test.index, y_pred, label='Forecasted', marker='x')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.set_title(station, fontsize=10)
    ax.legend()

# Hide any unused axes
for j in range(len(valid_data_stations), len(axes)):
    axes[j].axis('off')

fig.tight_layout(pad=4.0)  # Adjusted padding for clarity
plt.show()
RMSE for Lochee Charging Hub, Dundee Stations: 125.73072340697016
RMSE for Greenmarket Multi Car Park, Dundee Stations: 41.03446318761495
RMSE for Queen Street Car Park, Broughty Ferry, Dundee Stations: 51.5228894107852
RMSE for Housing Office West, Dundee Stations: 10.076021070343195
RMSE for Nethergate, Dundee Stations: 12.443212563723245
RMSE for Brington Place Sheltered Housing, Dundee Stations: 4.413265415615367
RMSE for Balunie Drive, Dundee Stations: 12.25633817632058
RMSE for Social Work Building, Jack Martin Way, Dundee Stations: 7.5544917183593485
RMSE for Dundee Ice Arena, Dundee Stations: 54.43000965362863
RMSE for Mitchell Street, Dundee Stations: 5.743683789085189
RMSE for Oakland Day Centre, Dundee Stations: 7.957195877401366
RMSE for Dock Street, Dundee Stations: 20.95884889737276
RMSE for Whitfield Centre, Dundee Stations: 18.775805519187006
RMSE for Housing Office East, Dundee Stations: 15.81541397932199
RMSE for Gellatly Street Car Park, Dundee Stations: 23.397402274384525
RMSE for Dundee House, Dundee Stations: 263.2790289046673
RMSE for Public Works Dept, Clepington Rd. Dundee Stations: 25.321952059399305
RMSE for Marchbanks, Dundee Stations: 19.11664194555798
RMSE for Olympia Multi-Storey Car Park, Dundee Stations: 13.944673154243047
RMSE for South Tay Street, Dundee Stations: 20.119073641666752
RMSE for Ardler Complex, Dundee Stations: 10.701520355211217
RMSE for Menziehill House, Dundee Stations: 11.746132847083475
RMSE for Turriff House Rannoch Road, Dundee Stations: 9.516726429292797
RMSE for Trades Lane, Dundee Stations: 12.262027054013133
RMSE for University of Dundee, Nethergate, Dundee Stations: 12.498735394878022
RMSE for Janet Brougham House, Dundee Stations: 8.38937608866118
RMSE for South Tay Street Stations: 18.178915328626093
RMSE for Earn Cresent, Dundee Stations: 8.808114916049478
RMSE for DCC Environment, 34 Harefield Road Stations: 15.054936091461833
No description has been provided for this image

Dundee - Offset¶

In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Function to create lagged features
def buildLaggedFeatures_off(s, lag=30, offset=0, dropna=True):
    # Create a DataFrame with shifted data
    df = pd.concat([s.shift(i + offset) for i in range(lag + 1)], axis=1)

    # Adjust column names to reflect the lags, considering the offset
    df.columns = ['lag_{}'.format(i + offset) if i != 0 else s.name for i in range(lag + 1)]

    # Drop rows with missing values if requested
    if dropna:
        df = df.dropna()

    return df

# Load data
data = pd.read_csv('Dundee_merged (1).csv')  # Ensure the path is correct

# Filter out specific stations
data_filtered = data[~data['Address 1'].isin(['Trades Lane', 'Sinclair Street', 'Sinclair Street, Dundee'])]

# Convert 'Start Date' to datetime
data_filtered['Start DateTime'] = pd.to_datetime(data_filtered['Start Date'], errors='coerce')
data_filtered.dropna(subset=['Start DateTime', 'Energy(kWh)'], inplace=True)

# Get unique stations after filtering
unique_stations = data_filtered['Address 1'].unique()

# Prepare to dynamically create subplots based on data availability
valid_data_stations = []  # To store stations with enough data

# Loop through each station and check data
for station in unique_stations:
    station_data = data_filtered[data_filtered['Address 1'] == station]
    daily_energy = station_data.groupby(station_data['Start DateTime'].dt.floor('D'))['Energy(kWh)'].sum()
    lagged_features = buildLaggedFeatures_off(daily_energy, lag=30,offset=7)
    if lagged_features.shape[0] > 30:  # Ensuring enough data for split
        valid_data_stations.append(station)

# Number of plots
n_cols = 5
n_rows = (len(valid_data_stations) + n_cols - 1) // n_cols
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(n_cols * 5, n_rows * 4))  # Slightly increased size
axes = axes.flatten()

# Loop again to model and plot only where data is sufficient
for idx, station in enumerate(valid_data_stations):
    ax = axes[idx]
    station_data = data_filtered[data_filtered['Address 1'] == station]
    daily_energy = station_data.groupby(station_data['Start DateTime'].dt.floor('D'))['Energy(kWh)'].sum()
    lagged_features = buildLaggedFeatures_off(daily_energy, lag=30, offset=7)
    train_data = lagged_features.iloc[:-30]
    test_data = lagged_features.iloc[-30:]

    X_train = train_data.drop(columns=['Energy(kWh)'])
    y_train = train_data['Energy(kWh)']
    X_test = test_data.drop(columns=['Energy(kWh)'])
    y_test = test_data['Energy(kWh)']

    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)
    y_pred = rf_regressor.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    print(f'RMSE for {station} Stations: {rmse}')
    # Set custom date ticks
    dates = y_test.index.tolist()
    ticks_to_use = [dates[0], dates[len(dates)//3], dates[2*len(dates)//3], dates[-1]]
    ax.set_xticks(ticks_to_use)

    ax.plot(y_test.index, y_test, label='Actual', marker='o')
    ax.plot(y_test.index, y_pred, label='Forecasted', marker='x')
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.set_title(station, fontsize=10)
    ax.legend()

# Hide any unused axes
for j in range(len(valid_data_stations), len(axes)):
    axes[j].axis('off')

fig.tight_layout(pad=4.0)  # Adjusted padding for clarity
plt.show()
RMSE for Lochee Charging Hub, Dundee Stations: 125.37137497879347
RMSE for Greenmarket Multi Car Park, Dundee Stations: 41.834306202468724
RMSE for Queen Street Car Park, Broughty Ferry, Dundee Stations: 45.201711295329915
RMSE for Housing Office West, Dundee Stations: 8.974539660023238
RMSE for Nethergate, Dundee Stations: 11.976774163911305
RMSE for Brington Place Sheltered Housing, Dundee Stations: 4.423448317960397
RMSE for Balunie Drive, Dundee Stations: 11.778013482105825
RMSE for Social Work Building, Jack Martin Way, Dundee Stations: 7.627394181354032
RMSE for Dundee Ice Arena, Dundee Stations: 53.10681582086938
RMSE for Mitchell Street, Dundee Stations: 5.162411034164818
RMSE for Oakland Day Centre, Dundee Stations: 8.674263053232051
RMSE for Dock Street, Dundee Stations: 20.425357605117792
RMSE for Whitfield Centre, Dundee Stations: 20.417607196486074
RMSE for Housing Office East, Dundee Stations: 15.751217556313966
RMSE for Gellatly Street Car Park, Dundee Stations: 25.90312299511663
RMSE for Dundee House, Dundee Stations: 262.17560758434223
RMSE for Public Works Dept, Clepington Rd. Dundee Stations: 20.52536201045445
RMSE for Marchbanks, Dundee Stations: 17.4355007383499
RMSE for Olympia Multi-Storey Car Park, Dundee Stations: 12.767447010750951
RMSE for South Tay Street, Dundee Stations: 19.786499235523195
RMSE for Ardler Complex, Dundee Stations: 10.950630858828793
RMSE for Menziehill House, Dundee Stations: 11.715689098711467
RMSE for Turriff House Rannoch Road, Dundee Stations: 7.613027997058728
RMSE for Trades Lane, Dundee Stations: 10.000442342393494
RMSE for University of Dundee, Nethergate, Dundee Stations: 12.66127764196015
RMSE for Janet Brougham House, Dundee Stations: 8.403209720735685
RMSE for South Tay Street Stations: 18.736883805267972
RMSE for Earn Cresent, Dundee Stations: 9.479696697539078
RMSE for DCC Environment, 34 Harefield Road Stations: 15.234425523804084
No description has been provided for this image

Perth¶

In [11]:
data = pd.read_csv('Perth&Kinross_merged1.csv')
In [12]:
#occurrences of each unique station
station_counts = data['Adress 1'].value_counts()
plt.figure(figsize=(15, 10))  
station_counts.plot(kind='barh', color='skyblue')  
plt.title('Usage Count of Individual Stations')
plt.xlabel('Usage Count')
plt.ylabel('Station Name')
plt.gca().invert_yaxis()  
plt.tight_layout()  
plt.show()
No description has been provided for this image
In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Function to create lagged features
def buildLaggedFeatures(s, lag=30, dropna=True):
    df = pd.concat([s.shift(i) for i in range(lag + 1)], axis=1)
    df.columns = ['lag_{}'.format(i) if i != 0 else s.name for i in range(lag + 1)]
    if dropna:
        df = df.dropna()
    return df

# Load the new dataset
data = pd.read_csv('Perth&Kinross_merged1.csv')  # Update the path to your dataset

# Convert 'Start Date' to datetime, assuming 'Start Date' is the column name
data['Start DateTime'] = pd.to_datetime(data['Start Date'], errors='coerce')
data.dropna(subset=['Start DateTime', 'Energy(kWh)'], inplace=True)

# Get unique stations after filtering
unique_stations = data['Adress 1'].unique()

# Number of plots
n_cols = 5
n_rows = (len(unique_stations) + n_cols - 1) // n_cols
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(n_cols * 5, n_rows * 4))
axes = axes.flatten()

# Loop through each station and plot
for idx, station in enumerate(unique_stations):
    ax = axes[idx]
    station_data = data[data['Adress 1'] == station]
    daily_energy = station_data.groupby(station_data['Start DateTime'].dt.floor('D'))['Energy(kWh)'].sum()
    lagged_features = buildLaggedFeatures(daily_energy, lag=30)

    if len(lagged_features) > 30:
        train_data = lagged_features.iloc[:-30]
        test_data = lagged_features.iloc[-30:]

        X_train = train_data.drop('Energy(kWh)', axis=1)
        y_train = train_data['Energy(kWh)']
        X_test = test_data.drop('Energy(kWh)', axis=1)
        y_test = test_data['Energy(kWh)']

        rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_regressor.fit(X_train, y_train)
        y_pred = rf_regressor.predict(X_test)

        ax.plot(y_test.index, y_test, label='Actual', marker='o')
        ax.plot(y_test.index, y_pred, label='Forecasted', marker='x')
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    else:
        # Display message if not enough data but keep the plot clean
        ax.text(0.5, 0.5, 'Not enough data', horizontalalignment='center', verticalalignment='center', fontsize=12, transform=ax.transAxes)
    
    ax.set_title(station, fontsize=10)
    ax.legend()

# Hide any unused axes if there are less than planned
for j in range(idx + 1, len(axes)):
    axes[j].axis('off')

fig.tight_layout(pad=4.0)  # Adjusted padding for clarity
plt.show()
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No description has been provided for this image